{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "73b8f8af-0af7-4b2b-afd7-d213d490bd2c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "Bad key savefig.frameon in file /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 421 ('savefig.frameon : True')\n",
      "You probably need to get an updated matplotlibrc file from\n",
      "https://github.com/matplotlib/matplotlib/blob/v3.3.4/matplotlibrc.template\n",
      "or from the matplotlib source distribution\n",
      "\n",
      "Bad key verbose.level in file /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 472 ('verbose.level  : silent      # one of silent, helpful, debug, debug-annoying')\n",
      "You probably need to get an updated matplotlibrc file from\n",
      "https://github.com/matplotlib/matplotlib/blob/v3.3.4/matplotlibrc.template\n",
      "or from the matplotlib source distribution\n",
      "\n",
      "Bad key verbose.fileo in file /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 473 ('verbose.fileo  : sys.stdout  # a log filename, sys.stdout or sys.stderr')\n",
      "You probably need to get an updated matplotlibrc file from\n",
      "https://github.com/matplotlib/matplotlib/blob/v3.3.4/matplotlibrc.template\n",
      "or from the matplotlib source distribution\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n",
      "In /workspace/.conda/envs/tbip/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: \n",
      "The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from scipy.stats import pearsonr, spearmanr\n",
    "import scipy.sparse as sparse\n",
    "from scipy.stats import bernoulli, poisson\n",
    "import analysis_utils_mine as utils\n",
    "\n",
    "import json\n",
    "import pandas as pd\n",
    "import ast\n",
    "from datetime import datetime\n",
    "import torch\n",
    "import pandas as pd\n",
    "from datetime import datetime, timedelta\n",
    "import pickle\n",
    "\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "add218dd-7690-448d-881c-f0a5f58f0554",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7e8fc0f0-2ba7-4d06-8f15-8208fa2ceb71",
   "metadata": {},
   "outputs": [],
   "source": [
    "# floor speeches\n",
    "\n",
    "project_dir = os.path.abspath('../../data/floor_speeches_congs_115_116/') \n",
    "fit_dir = os.path.join(project_dir, \"mallet_fits_removed_procedural_speeches\")\n",
    "\n",
    "data_dir = os.path.join(project_dir, \"clean_removing_procedural\")\n",
    "(_, vocabulary_speeches, _, \n",
    " _) = utils.load_text_data(data_dir)\n",
    "\n",
    "#print(len(vocabulary_speeches))\n",
    "\n",
    "topic_word_speeches = np.load(os.path.join(fit_dir, \n",
    "                                           'topic_word.npy'))\n",
    "#print(topic_word_speeches.shape)\n",
    "doc_topic_speeches = np.load(os.path.join(fit_dir, \n",
    "                                          'doc_topic.npy'))\n",
    "#print(doc_topic_speeches.shape)\n",
    "raw_texts_speeches = open(os.path.join(data_dir, 'raw_documents.txt')).readlines()\n",
    "raw_texts_speeches = list(map(lambda x:x.rstrip(), raw_texts_speeches))\n",
    "#print(len(raw_texts_speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8feecbee-8d1f-4509-b92a-aa1753bf7e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# twitter\n",
    "\n",
    "project_dir = os.path.abspath('../../data/tweets_cong_115_116/') \n",
    "fit_dir = os.path.join(project_dir, \"mallet_results/tbip_expanded_preprocessing_k50\")\n",
    "\n",
    "data_dir = os.path.join(project_dir, \"clean2\")\n",
    "(_, vocabulary_tweets, _, \n",
    " _) = utils.load_text_data(data_dir)\n",
    "\n",
    "#print(len(vocabulary_tweets))\n",
    "\n",
    "topic_word_tweets = np.load(os.path.join(fit_dir, \n",
    "                                           'topic_word.npy'))\n",
    "#print(topic_word_tweets.shape)\n",
    "doc_topic_tweets = np.load(os.path.join(fit_dir, \n",
    "                                          'doc_topic.npy'))\n",
    "#print(doc_topic_tweets.shape)\n",
    "raw_texts_tweets = open(os.path.join(data_dir, 'raw_documents.txt')).readlines()\n",
    "raw_texts_tweets = list(map(lambda x:x.rstrip(), raw_texts_tweets))\n",
    "#print(len(raw_texts_tweets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3a84789b-7294-46a0-9a84-e613054d7984",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocabulary_tweets = list(vocabulary_tweets)\n",
    "vocabulary_speeches = list(vocabulary_speeches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "54367636-00d8-447c-90b7-cc9d43fd16f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rescale_to_probs_renorm(arr):\n",
    "    return arr/arr.sum(1, keepdims=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "64053678-7bd4-4a51-bddd-dd59a6d419ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# rescaling to make them probs \n",
    "topic_word_speeches = rescale_to_probs_renorm(topic_word_speeches)\n",
    "#print(topic_word_speeches.shape)\n",
    "\n",
    "doc_topic_speeches = rescale_to_probs_renorm(doc_topic_speeches)\n",
    "#print(doc_topic_speeches.shape)\n",
    "\n",
    "topic_word_tweets = rescale_to_probs_renorm(topic_word_tweets)\n",
    "#print(topic_word_tweets.shape)\n",
    "\n",
    "doc_topic_tweets = rescale_to_probs_renorm(doc_topic_tweets)\n",
    "#print(doc_topic_tweets.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7340cfaf-f52c-438e-840a-a68d2e798aa7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_doc_topic_file_for_annotators(doc_topic,\n",
    "                                         raw_texts,\n",
    "                                         outpath,\n",
    "                                         top_doc_num_per_topic=500):\n",
    "    out_df = pd.DataFrame()\n",
    "    num_docs, num_topics = doc_topic.shape\n",
    "    all_top_doc_inds = set()\n",
    "    for topic_ind in range(num_topics):\n",
    "        topic_vals = list(enumerate(list(doc_topic[:, topic_ind])))\n",
    "        topic_vals = sorted(topic_vals, key=lambda x:x[1])[::-1][:top_doc_num_per_topic]\n",
    "        for ind, _ in topic_vals:\n",
    "            all_top_doc_inds.add(ind)\n",
    "    all_top_doc_inds = list(all_top_doc_inds)\n",
    "    selected_raw_texts = [x for i,x in enumerate(raw_texts) if i in all_top_doc_inds]\n",
    "    #print(len(all_top_doc_inds))\n",
    "    out_df['docID'] = [i + 1 for i in range(len(all_top_doc_inds))]\n",
    "    for topic_ind in range(num_topics):\n",
    "        out_df['Topic ' + str(topic_ind + 1)] = list(doc_topic[all_top_doc_inds, topic_ind])\n",
    "    out_df['text'] = selected_raw_texts\n",
    "    \n",
    "    out_df.to_excel(os.path.join(outpath, 'document_topics.xlsx'), index=False, float_format='%.3f')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c938c371-de88-45b8-8e90-c2ae80c62f26",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24781\n"
     ]
    }
   ],
   "source": [
    "create_doc_topic_file_for_annotators(doc_topic_speeches, \n",
    "                                     raw_texts_speeches,\n",
    "                                     'venue_diff_polsci/floor_speeches/mallet_output_labeling/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f0c13d2d-7fd4-44a2-b099-e050d594988e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50000\n"
     ]
    }
   ],
   "source": [
    "create_doc_topic_file_for_annotators(doc_topic_tweets,\n",
    "                                     raw_texts_tweets,\n",
    "                                     'venue_diff_polsci/twitter/mallet_output_labeling/',\n",
    "                                     1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0827c94-77b4-4fab-b57f-a4b7ed0cb1b4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8436add1-d71d-462e-a4e9-6dd9e9b4db8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_topics_file_for_human_labeling_and_rating(topic_word,\n",
    "                                                     vocab,\n",
    "                                                     outpath,\n",
    "                                                     num_top_words = 30):\n",
    "    out_df = pd.DataFrame(columns = ['Topic',\n",
    "                                     '',\n",
    "                                     'Coherence',\n",
    "                                     'Polarization',\n",
    "                                     'Topic Name',\n",
    "                                     'Description',\n",
    "                                     'Notes/Comments'])\n",
    "    num_topics, num_words = topic_word.shape\n",
    "    topics_l = []\n",
    "    topics_probs = []\n",
    "    for k in range(num_topics):\n",
    "        topics_l.append('')\n",
    "        topics_probs.append(np.nan)\n",
    "        topics_l.append('Topic ' + str(k+1))\n",
    "        topics_probs.append(np.nan)\n",
    "#         num_top_words = 0\n",
    "#         sum_so_far = 0.0\n",
    "#         z = sorted(topic_word[k])[::-1]\n",
    "#         for p in z:\n",
    "#             sum_so_far += p\n",
    "#             num_top_words += 1\n",
    "#             topics_probs.append(p)\n",
    "#             if sum_so_far >= prob_thresh:\n",
    "#                 break\n",
    "        top_word_inds = np.argsort(list(topic_word[k]))[::-1][:num_top_words]\n",
    "        top_words = [vocab[i] for i in top_word_inds]\n",
    "        top_word_probs = [topic_word[k][i] for i in top_word_inds]\n",
    "        topics_l += top_words\n",
    "        topics_probs += top_word_probs\n",
    "    out_df['Topic'] = topics_l\n",
    "    out_df[''] = topics_probs\n",
    "    out_df.to_excel(os.path.join(outpath, 'topics_for_annotation.xlsx'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d8e2e73a-9b58-4d80-a6fe-6ed2284164a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_topics_file_for_human_labeling_and_rating(topic_word_speeches,\n",
    "                                                 vocabulary_speeches,\n",
    "                                                 'venue_diff_polsci/floor_speeches/mallet_output_labeling/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4bb847db-e25f-4f78-b54e-5eea5bf12600",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_topics_file_for_human_labeling_and_rating(topic_word_tweets,\n",
    "                                                 vocabulary_tweets,\n",
    "                                                 'venue_diff_polsci/twitter/mallet_output_labeling/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f01b72d5-6739-45f6-876a-b9e8d98d6604",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_word_topic_file_for_generating_clouds(topic_word,\n",
    "                                                 vocab,\n",
    "                                                 outpath):\n",
    "    out_df = pd.DataFrame()\n",
    "    out_df['Word'] = vocab\n",
    "    for k in range(topic_word.shape[0]):\n",
    "        out_df['Topic ' + str(k+1)] = list(topic_word[k, :])\n",
    "    \n",
    "    out_df.to_csv(os.path.join(outpath, 'word_topics_file.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "3cc35e72-a911-4b35-8b4f-15c6e84a71ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_word_topic_file_for_generating_clouds(topic_word_speeches,\n",
    "                                             vocabulary_speeches,\n",
    "                                             'venue_diff_polsci/floor_speeches/mallet_output_labeling/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a2f6c9d0-923a-438f-9cc5-311ef360cef4",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_word_topic_file_for_generating_clouds(topic_word_tweets,\n",
    "                                             vocabulary_tweets,\n",
    "                                             'venue_diff_polsci/twitter/mallet_output_labeling/')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "711c633c-eaa9-4ee2-b598-a9342f0dfd1e",
   "metadata": {},
   "source": [
    "### Run python scripts below to generate word clouds (note that we do not use word clouds in our final topic curation process, and instead use excel bars derived from same topic-word probability values (topic_probs) in the topics_for_annotation files created and saved above)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "8b940804-de6f-481d-99f6-06ea3c6355a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating cloud PDFs\n",
      "100%|███████████████████████████████████████████| 51/51 [00:17<00:00,  2.90it/s]\n",
      "Merging clouds into /workspace/pranav/tbip/analysis/human_annotation_files/venue_diff_polsci/floor_speeches/mallet_output_labeling/clouds.pdf\n",
      "100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 438.87it/s]\n"
     ]
    }
   ],
   "source": [
    "!python ../topwords.py -o /workspace/pranav/tbip/analysis/human_annotation_files/venue_diff_polsci/floor_speeches/mallet_output_labeling -i /workspace/pranav/tbip/analysis/human_annotation_files/venue_diff_polsci/floor_speeches/mallet_output_labeling/word_topics_file.csv\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "d01791dc-ec6a-4670-9b04-05476576f936",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating cloud PDFs\n",
      "100%|███████████████████████████████████████████| 51/51 [00:17<00:00,  2.87it/s]\n",
      "Merging clouds into /workspace/pranav/tbip/analysis/human_annotation_files/venue_diff_polsci/twitter/mallet_output_labeling/clouds.pdf\n",
      "100%|██████████████████████████████████████████| 50/50 [00:00<00:00, 415.47it/s]\n"
     ]
    }
   ],
   "source": [
    "!python ../topwords.py -o /workspace/pranav/tbip/analysis/human_annotation_files/venue_diff_polsci/twitter/mallet_output_labeling -i /workspace/pranav/tbip/analysis/human_annotation_files/venue_diff_polsci/twitter/mallet_output_labeling/word_topics_file.csv\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tbip] *",
   "language": "python",
   "name": "conda-env-tbip-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
